Wykorzystaj rmarkdown do zbudowania dokumentu html, z opisem poniższych analiz.
Wczytaj te zbiory danych. Dla pierwszego zbuduj model zależności pomiędzy wzrostem syna i ojca. Drugi zawiera, dla różnych genomów, między innymi informacje o wielkości genomu oraz o średnim współczynniku GC (udziale zasad G lub C w genomie).
library(PBImisc)
library(ggplot2)
library(stringr)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Load the example data sets shipped with PBImisc:
# heights - heights of married couples; genomes - genome size, GC content
# and taxonomic annotations for a collection of organisms.
data(heights)
data(genomes)
help(heights)
Przedstaw graficznie zależność pomiędzy cechami. Sprawdź czy transformacja (np. logarytm) nie poprawi liniowości modelu.
Wyznacz model liniowy dla obu powyższych zależności używając funkcji lm().
Użyj funkcji plot() aby wyznaczyć wykresy diagnostyczne.
Użyj funkcji rstandard, rstudent, cooks.distance aby wyznaczyć reszty i miary wpływu.
# Peek at the first rows of the couples' height data.
heights[1:4, ]
## Husband Wife
## 1 186 175
## 2 180 168
## 3 160 154
## 4 186 166
# Marginal distributions of both variables.
ggplot(data=heights) + geom_histogram(aes(x=Husband))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=heights) + geom_histogram(aes(x=Wife))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Linear model: wife's height explained by husband's height.
hgt_lm <- lm(Wife ~ Husband, data = heights)
# Collect observations, fitted values and influence measures in one frame.
# Use the accessor fitted() instead of hgt_lm$fit, which only works through
# partial matching of the fitted.values component.
hgts_model <- cbind(heights, data.frame(
  'var' = heights$Husband,           # predictor
  'obs' = heights$Wife,              # response
  'fit' = fitted(hgt_lm),            # fitted values
  'rstudent' = rstudent(hgt_lm),     # studentized residuals
  'hat' = hatvalues(hgt_lm),         # leverages
  'cooks' = cooks.distance(hgt_lm))) # Cook's distances
hgts_model$'ID' <- seq_len(nrow(hgts_model))
# Unbiased residual variance estimate: divide by the residual degrees of
# freedom n - 2 (matches the square of summary()'s residual standard
# error), not by n, which gives the downward-biased ML estimate.
error_var <- sum((hgts_model$obs - hgts_model$fit)^2)/(nrow(hgts_model) - 2)
# Coefficient estimates, significance and fit statistics for the model.
summary(hgt_lm)
##
## Call:
## lm(formula = Wife ~ Husband, data = heights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.4685 -3.9208 0.8301 3.9538 11.1287
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.93015 10.66162 3.933 0.000161 ***
## Husband 0.69965 0.06106 11.458 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.928 on 94 degrees of freedom
## Multiple R-squared: 0.5828, Adjusted R-squared: 0.5783
## F-statistic: 131.3 on 1 and 94 DF, p-value: < 2.2e-16
# Scatter plot of the data with the fitted regression line overlaid.
ggplot(data=hgts_model) + geom_point(aes(x=var, y=obs)) + geom_abline(intercept=hgt_lm$coef[1], slope=hgt_lm$coef[2], col="red")
The slope and intercept are statistically significant.
# Histogram of raw residuals (fit - observed) and of studentized residuals.
qplot(hgts_model$fit - hgts_model$obs, geom="histogram")
qplot(hgts_model$rstudent, geom="histogram")
# Manual normal Q-Q plot of the studentized residuals.
residual.data <- data.frame('res' = sort(hgts_model$rstudent))
# Plotting positions: use ppoints() rather than i/n — with i/n the largest
# observation gets empirical CDF 1, so qnorm(1) = Inf and that point is
# silently dropped from the plot.
residual.data$'emp.cdf' <- ppoints(nrow(residual.data))
residual.data$'th.quant' <- qnorm(residual.data$'emp.cdf')
ggplot(data = residual.data, aes(x = th.quant, y = res)) + geom_point() + geom_abline(col = 'red')
# Cross-check with the built-in Q-Q plot.
qqnorm(hgts_model$rstudent)
The residuals are approximately normally distributed, but skewed — the distribution of wives’ heights is more dispersed for lower heights.
# Studentized residuals against leverage: points far from the red zero
# line AND with high leverage are potentially influential.
ggplot(data=hgts_model) + geom_text(aes(x=hat, y=rstudent, label=ID)) + geom_abline(slope=0, col='red') + ggtitle("Residues vs leverages")
No significantly deviant observations with high leverage; Observations 39 and 72 a little suspicious.
# Cook's distance against fitted values, studentized residuals and
# leverage, with observation IDs so influential points can be identified.
ggplot(data=hgts_model) + geom_text(aes(x=fit, y=cooks, label=ID))
ggplot(data=hgts_model) + geom_text(aes(x=rstudent, y=cooks, label=ID))
ggplot(data=hgts_model) + geom_text(aes(x=hat, y=cooks, label=ID))
Another potential outlier is the observation 12.
hgts_model$"Outlier" = factor(0, levels=c(0, 1))
hgts_model$"Outlier"[c(12, 72, 39)] = 1
ggplot(data=hgts_model) + geom_text(aes(x=var, y=obs, col=Outlier, label=ID)) + geom_abline(intercept=hgt_lm$coef[1], slope=hgt_lm$coef[2], col="red")
Observation 12 is a potential outlier, but since it’s surrounded by many observations it does not influence the model.
hgts_model[12,c("Husband", "Wife")]
## Husband Wife
## 12 178 147
data(genomes)  # reload (already loaded above); harmless
names(genomes)
## [1] "organism" "group" "size" "GC" "habitat"
## [6] "temp.group" "temperature"
# Scale both variables by their standard deviation; mean-centering is
# deliberately left commented out. NOTE(review): scale() would do both
# steps in one call if centering were wanted.
#genomes$size <- genomes$size - mean(genomes$size)
genomes$size <- genomes$size/sd(genomes$size)
#genomes$GC <- genomes$GC - mean(genomes$GC)
genomes$GC <- genomes$GC/sd(genomes$GC)
# Visually compare candidate transformations for improving linearity.
qplot(genomes$size, genomes$GC)
qplot(log(genomes$size), genomes$GC)
qplot(sqrt(genomes$size), genomes$GC)
qplot(genomes$size, sqrt(genomes$GC))
Logarithmic and square root transformations of independent variable seem best. Taking square root of GC doesn’t change much so I’ll pass on it.
# Candidate models: GC content vs raw, log-, sqrt- and fourth-root-
# transformed genome size.
genome_lm <- lm(GC ~ size, data = genomes)
log_lm <- lm(GC ~ log(size), data = genomes)
sqrt_lm <- lm(GC ~ sqrt(size), data = genomes)
# Fourth root: inspired by the diagnostic plots, which showed the square
# root transform performing better than the log transform.
double_sqrt_lm <- lm(GC ~ sqrt(sqrt(size)), data = genomes)
models <- list('raw.size'=genome_lm, 'log.size'=log_lm, 'sqrt.size'=sqrt_lm, 'double_sqrt.size'=double_sqrt_lm)
# Per-model summary frames. Build them with data.frame() rather than
# as.data.frame(cbind(...)): cbind() on mixed inputs creates a numeric
# matrix, silently coercing the factors to their integer codes (which
# also made col=group below a continuous rather than discrete scale).
genome_summaries <- lapply(models, function(m)
  data.frame("org" = factor(genomes$organism),
             "group" = factor(genomes$group),
             "size" = genomes$size,
             "GC" = genomes$GC,
             predict(m, interval = "confidence"),  # fit / lwr / upr columns
             "hat" = hatvalues(m),
             "res" = rstudent(m)))
# Residual sum of squares of each model, shown in the plot subtitles.
genome_RSS <- lapply(models, function(m) sum(residuals(m)^2))
for(model_name in names(models)){
p <- ggplot(data=genome_summaries[[model_name]]) +geom_point(aes(x=size, y=GC, col=group)) + geom_line(aes(x=size, y=fit), col="red") + geom_ribbon(aes(x=size, ymin=lwr, ymax=upr), alpha=0.1, fill='orange') + ggtitle(model_name, subtitle=str_c("RSS = ", genome_RSS[[model_name]]))
show(p)
}
The best fit (lowest RSS) is for the double square (i.e. fourth) root of the genome size. However, different groups seem to exhibit different dependencies, which suggests taking some test to check this. But first, let’s do some diagnostics:
# Base-R diagnostic plots (residuals vs fitted, normal Q-Q,
# scale-location, residuals vs leverage) for every candidate model.
for(n in names(models)){
plot(models[[n]], sub.caption = n)
}
The dependence of residuals on fitted values is most linear for the square root and the double square root transform, which suggests that they are better than the log transform (even though the log transform has lower RSS than the square root transform). The variance seems to be heteroskedastic, being the largest for medium genome sizes (which may be because those genomes are probably the most common ones). There is one rather influential outlier for non-transformed data, which vanishes after taking any transformation.
# Breusch-Pagan test for heteroskedasticity of each model's residuals.
for(n in names(models)){
print(n)
show(bptest(models[[n]]))
}
## [1] "raw.size"
##
## studentized Breusch-Pagan test
##
## data: models[[n]]
## BP = 3.4043, df = 1, p-value = 0.06503
##
## [1] "log.size"
##
## studentized Breusch-Pagan test
##
## data: models[[n]]
## BP = 17.207, df = 1, p-value = 3.352e-05
##
## [1] "sqrt.size"
##
## studentized Breusch-Pagan test
##
## data: models[[n]]
## BP = 8.0361, df = 1, p-value = 0.004585
##
## [1] "double_sqrt.size"
##
## studentized Breusch-Pagan test
##
## data: models[[n]]
## BP = 12.451, df = 1, p-value = 0.0004179
The variance is indeed heteroskedastic (except maybe for non-transformed genome size, but it’s not a good model).
# Goldfeld-Quandt test: does the residual variance decrease after the
# first 30% of observations (taken in data order)?
for(n in names(models)){
print(n)
show(gqtest(models[[n]], point=0.3, alternative="less"))
}
## [1] "raw.size"
##
## Goldfeld-Quandt test
##
## data: models[[n]]
## GQ = 0.63184, df1 = 505, df2 = 215, p-value = 1.984e-05
## alternative hypothesis: variance decreases from segment 1 to 2
##
## [1] "log.size"
##
## Goldfeld-Quandt test
##
## data: models[[n]]
## GQ = 0.625, df1 = 505, df2 = 215, p-value = 1.284e-05
## alternative hypothesis: variance decreases from segment 1 to 2
##
## [1] "sqrt.size"
##
## Goldfeld-Quandt test
##
## data: models[[n]]
## GQ = 0.62035, df1 = 505, df2 = 215, p-value = 9.472e-06
## alternative hypothesis: variance decreases from segment 1 to 2
##
## [1] "double_sqrt.size"
##
## Goldfeld-Quandt test
##
## data: models[[n]]
## GQ = 0.6204, df1 = 505, df2 = 215, p-value = 9.501e-06
## alternative hypothesis: variance decreases from segment 1 to 2
After one-third of the observations the variance decreases (but not after one-fifth, i.e. for point=0.2).
# Harrison-McCabe test: fraction of the RSS contributed by the first half
# of the observations; detects a monotone trend in residual variance.
for(n in names(models)){
print(n)
show(hmctest(models[[n]], point=0.5))
}
## [1] "raw.size"
##
## Harrison-McCabe test
##
## data: models[[n]]
## HMC = 0.59011, p-value = 0.998
##
## [1] "log.size"
##
## Harrison-McCabe test
##
## data: models[[n]]
## HMC = 0.59773, p-value = 1
##
## [1] "sqrt.size"
##
## Harrison-McCabe test
##
## data: models[[n]]
## HMC = 0.59844, p-value = 0.999
##
## [1] "double_sqrt.size"
##
## Harrison-McCabe test
##
## data: models[[n]]
## HMC = 0.59936, p-value = 0.999
This test checks which fraction of RSS is generated by a given fraction of data, so it basically checks for a trend in residual variance. This won’t work here because the variance of the residuals is more-or-less symmetric.
# Durbin-Watson test for positive lag-1 autocorrelation of the residuals
# (in data order).
for(n in names(models)){
print(n)
show(dwtest(models[[n]], alternative = "greater"))
}
## [1] "raw.size"
##
## Durbin-Watson test
##
## data: models[[n]]
## DW = 0.90326, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
##
## [1] "log.size"
##
## Durbin-Watson test
##
## data: models[[n]]
## DW = 0.84417, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
##
## [1] "sqrt.size"
##
## Durbin-Watson test
##
## data: models[[n]]
## DW = 0.87276, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
##
## [1] "double_sqrt.size"
##
## Durbin-Watson test
##
## data: models[[n]]
## DW = 0.85763, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
The autocorrelation is greater than 0. Check:
# Lag-1 autocorrelation of the fourth-root model's residuals. Use
# double_sqrt_lm consistently: the original indexed with
# length(sqrt_lm$residuals), which only worked because both models happen
# to have the same number of observations.
cor(double_sqrt_lm$residuals[-1], double_sqrt_lm$residuals[-length(double_sqrt_lm$residuals)])
## [1] 0.5703214
The residuals are positively autocorrelated. That’s strange because the data doesn’t seem to be ordered. Check whether this autocorrelation is an artifact:
# Shuffle the observations and recompute the lag-1 autocorrelation; it
# should vanish if the original value reflects an ordering of the data.
# sample(n) draws a random permutation (no hard-coded 724, no replace=F).
shuffle <- sample(nrow(genomes))
cor(double_sqrt_lm$residuals[shuffle][-1],
    double_sqrt_lm$residuals[shuffle][-length(double_sqrt_lm$residuals)])
## [1] -0.01167001
There is no autocorrelation in shuffled data, as it should be, so the apparent autocorrelation is not a bug. The data might be ordered somehow, but at first glance I don’t see any ordering.
# Residuals plotted in data order, to look for a visible ordering pattern
# (seq_along avoids the magic number 724).
qplot(seq_along(double_sqrt_lm$residuals), double_sqrt_lm$residuals)
# Breusch-Godfrey test for serial correlation of the residuals up to lag 5.
for(n in names(models)){
print(n)
show(bgtest(models[[n]], order = 5))
}
## [1] "raw.size"
##
## Breusch-Godfrey test for serial correlation of order up to 5
##
## data: models[[n]]
## LM test = 225.3, df = 5, p-value < 2.2e-16
##
## [1] "log.size"
##
## Breusch-Godfrey test for serial correlation of order up to 5
##
## data: models[[n]]
## LM test = 251.97, df = 5, p-value < 2.2e-16
##
## [1] "sqrt.size"
##
## Breusch-Godfrey test for serial correlation of order up to 5
##
## data: models[[n]]
## LM test = 238.47, df = 5, p-value < 2.2e-16
##
## [1] "double_sqrt.size"
##
## Breusch-Godfrey test for serial correlation of order up to 5
##
## data: models[[n]]
## LM test = 245.48, df = 5, p-value < 2.2e-16
# Harvey-Collier test for linearity of each model, with observations
# ordered by genome size.
for(n in names(models)){
print(n)
show(harvtest(models[[n]], order.by=~size, data=genomes))
}
## [1] "raw.size"
##
## Harvey-Collier test
##
## data: models[[n]]
## HC = 4.2988, df = 721, p-value = 1.953e-05
##
## [1] "log.size"
##
## Harvey-Collier test
##
## data: models[[n]]
## HC = 1.1855, df = 721, p-value = 0.2362
##
## [1] "sqrt.size"
##
## Harvey-Collier test
##
## data: models[[n]]
## HC = 1.7856, df = 721, p-value = 0.07459
##
## [1] "double_sqrt.size"
##
## Harvey-Collier test
##
## data: models[[n]]
## HC = 0.35363, df = 721, p-value = 0.7237
The relationship turned out to be linear for every transformation but not for un-transformed data, as should be expected.
# Rainbow test parameters: use the middle `fract` fraction of the
# observations centred at quantile `centr`, ordering by genome size.
# (<- rather than = for top-level assignment.)
fract <- 0.3
centr <- 0.8
for(n in names(models)){
print(n)
show(raintest(models[[n]], fraction=fract, center=centr, order.by=~size, data=genomes))
}
## [1] "raw.size"
##
## Rainbow test
##
## data: models[[n]]
## Rain = 0.94989, df1 = 507, df2 = 215, p-value = 0.6784
##
## [1] "log.size"
##
## Rainbow test
##
## data: models[[n]]
## Rain = 0.90224, df1 = 507, df2 = 215, p-value = 0.8198
##
## [1] "sqrt.size"
##
## Rainbow test
##
## data: models[[n]]
## Rain = 0.90592, df1 = 507, df2 = 215, p-value = 0.8103
##
## [1] "double_sqrt.size"
##
## Rainbow test
##
## data: models[[n]]
## Rain = 0.8982, df1 = 507, df2 = 215, p-value = 0.83
I don’t think the rainbow test will do well in this case, because the data is basically “locally linear”, and highly spread for medium genome sizes. This means that after restriction to a fraction of data either each model will fit better (after restriction to small or large genome sizes), or there will be no substantial change (after restriction to medium genome sizes)
# Reconstruct (approximately) the subset of observations the rainbow test
# used, to see why it fails to detect nonlinearity here. Use
# nrow(genomes) instead of the hard-coded 724.
test.data <- data.frame('size'=genomes$size, 'GC'=genomes$GC)
test.data <- test.data[as.integer(centr*nrow(genomes) - fract*nrow(genomes)*0.5):(centr*nrow(genomes) + fract*nrow(genomes)*0.5), ]
test.data <- test.data[order(test.data$size), ]
# Full data vs the tested subset, side by side.
ggplot(data=genomes) + geom_point(aes(x=size, y=GC))
ggplot(data=test.data) + geom_point(aes(x=size, y=GC))
# Ramsey RESET test for functional-form misspecification of each model.
for(n in names(models)){
print(n)
show(resettest(models[[n]]))
}
## [1] "raw.size"
##
## RESET test
##
## data: models[[n]]
## RESET = 10.703, df1 = 2, df2 = 720, p-value = 2.628e-05
##
## [1] "log.size"
##
## RESET test
##
## data: models[[n]]
## RESET = 1.9135, df1 = 2, df2 = 720, p-value = 0.1483
##
## [1] "sqrt.size"
##
## RESET test
##
## data: models[[n]]
## RESET = 2.2425, df1 = 2, df2 = 720, p-value = 0.1069
##
## [1] "double_sqrt.size"
##
## RESET test
##
## data: models[[n]]
## RESET = 0.64246, df1 = 2, df2 = 720, p-value = 0.5263
The first model is ill-specified, as should be expected. The others are ok. The results are in agreement with those of the Harvey-Collier test.
# Shapiro-Wilk normality test on the standardized residuals of each model.
for(n in names(models)){
print(n)
show(shapiro.test(rstandard(models[[n]])))
}
## [1] "raw.size"
##
## Shapiro-Wilk normality test
##
## data: rstandard(models[[n]])
## W = 0.98816, p-value = 1.372e-05
##
## [1] "log.size"
##
## Shapiro-Wilk normality test
##
## data: rstandard(models[[n]])
## W = 0.99189, p-value = 0.0005388
##
## [1] "sqrt.size"
##
## Shapiro-Wilk normality test
##
## data: rstandard(models[[n]])
## W = 0.99193, p-value = 0.0005609
##
## [1] "double_sqrt.size"
##
## Shapiro-Wilk normality test
##
## data: rstandard(models[[n]])
## W = 0.99261, p-value = 0.001176
# Kolmogorov-Smirnov test of the standardized residuals against the
# standard normal. NOTE(review): residuals are neither independent nor an
# i.i.d. sample from a fully specified distribution, so the nominal
# p-values are only approximate.
for(n in names(models)){
print(n)
show(ks.test(rstandard(models[[n]]), pnorm))
}
## [1] "raw.size"
##
## One-sample Kolmogorov-Smirnov test
##
## data: rstandard(models[[n]])
## D = 0.05521, p-value = 0.02422
## alternative hypothesis: two-sided
##
## [1] "log.size"
##
## One-sample Kolmogorov-Smirnov test
##
## data: rstandard(models[[n]])
## D = 0.05711, p-value = 0.01778
## alternative hypothesis: two-sided
##
## [1] "sqrt.size"
##
## One-sample Kolmogorov-Smirnov test
##
## data: rstandard(models[[n]])
## D = 0.052789, p-value = 0.03537
## alternative hypothesis: two-sided
##
## [1] "double_sqrt.size"
##
## One-sample Kolmogorov-Smirnov test
##
## data: rstandard(models[[n]])
## D = 0.053754, p-value = 0.03048
## alternative hypothesis: two-sided
Both tests rejected the hypothesis that the residuals are normally distributed. This may be caused by heteroskedasticity.
# Histograms of standardized residuals for a visual normality check.
for(n in names(models)){
print(n)
show(qplot(rstandard(models[[n]])))
}
## [1] "raw.size"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "log.size"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "sqrt.size"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "double_sqrt.size"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Extend the fourth-root model with the taxonomic group as a covariate.
h_lm <- lm(GC ~ sqrt(sqrt(size)) + group, data=genomes)
summary(h_lm)
##
## Call:
## lm(formula = GC ~ sqrt(sqrt(size)) + group, data = genomes)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.51270 -0.38227 0.00924 0.36254 2.15650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.346268 0.635538 0.545 0.58604
## sqrt(sqrt(size)) 2.882327 0.163052 17.677 < 2e-16 ***
## groupActinobacteria 1.088543 0.592758 1.836 0.06672 .
## groupAlphaproteobacteria 0.482474 0.591827 0.815 0.41522
## groupAquificae -0.340001 0.681919 -0.499 0.61822
## groupBacteroidetes/Chlorobi -0.097822 0.600945 -0.163 0.87074
## groupBetaproteobacteria 0.762479 0.592348 1.287 0.19844
## groupChlamydiae/Verrucomicrobia 0.136177 0.616029 0.221 0.82511
## groupChloroflexi 0.329485 0.634526 0.519 0.60374
## groupCrenarchaeota 0.373607 0.609124 0.613 0.53984
## groupCyanobacteria 0.020887 0.599069 0.035 0.97220
## groupDeinococcus-Thermus 1.706968 0.658326 2.593 0.00972 **
## groupDeltaproteobacteria 0.569864 0.601659 0.947 0.34389
## groupEpsilonproteobacteria -0.424194 0.604703 -0.701 0.48323
## groupEuryarchaeota 0.087597 0.598706 0.146 0.88372
## groupFirmicutes -0.653009 0.591869 -1.103 0.27028
## groupFusobacteria -1.256735 0.831680 -1.511 0.13122
## groupGammaproteobacteria -0.168401 0.589364 -0.286 0.77517
## groupNanoarchaeota 0.005401 0.838256 0.006 0.99486
## groupOther Bacteria -0.139052 0.619192 -0.225 0.82238
## groupPlanctomycetes -0.136672 0.828215 -0.165 0.86898
## groupSpirochaetes -0.582370 0.608795 -0.957 0.33910
## groupThermotogae -0.226928 0.631363 -0.359 0.71938
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5855 on 701 degrees of freedom
## Multiple R-squared: 0.6677, Adjusted R-squared: 0.6572
## F-statistic: 64.01 on 22 and 701 DF, p-value: < 2.2e-16
# Likelihood-ratio test: does adding the group term significantly improve
# on the size-only model?
lrtest(h_lm, double_sqrt_lm)
## Likelihood ratio test
##
## Model 1: GC ~ sqrt(sqrt(size)) + group
## Model 2: GC ~ sqrt(sqrt(size))
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 24 -628.03
## 2 3 -843.29 -21 430.52 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The difference in groups is significant.
# Fit a separate GC ~ log(size) model within every taxonomic group.
# seq_along() replaces 1:length(), TRUE replaces T, fitted() replaces the
# partially-matched $fit, and lapply() replaces sapply() (whose return
# type would depend on whether the groups are equally sized).
groups <- unique(genomes$group)
gr_models <- list()
for(g in seq_along(groups)){
d <- genomes[genomes$group == groups[[g]], ]
gr_models[[g]] <- lm(GC~log(size), data=d, x=TRUE)
}
# Gather fitted values; unlist() keeps the row-name labels, which are the
# original row indices of `genomes`, so the fits can be put back into the
# original row order.
gr_fit <- lapply(seq_along(groups), function(g) fitted(gr_models[[g]]))
gr_fit <- unlist(gr_fit)
gr_fit <- gr_fit[order(as.numeric(names(gr_fit)))]
gr_models_summary <- genomes[,c("size", "GC", "group")]
gr_models_summary$fit <- gr_fit
# Per-group fits overlaid on the data.
ggplot(data=gr_models_summary) + geom_point(aes(col=group, x=size, y=GC)) + geom_line(aes(x=size, y=fit, col=group))
The relationships are very different for different groups of organisms (sometimes even inverse), suggesting no universal biological connection between genome size and GC content.
# Inspect one group (index 13, Bacteroidetes/Chlorobi) in detail.
gr_nb <- 13
groups[[gr_nb]]
## [1] Bacteroidetes/Chlorobi
## 22 Levels: Acidobacteria Actinobacteria Alphaproteobacteria ... Thermotogae
gr_models[[gr_nb]]
##
## Call:
## lm(formula = GC ~ log(size), data = d, x = T)
##
## Coefficients:
## (Intercept) log(size)
## 4.0883 -0.9068
# Observations of this group with row labels, plus the group-specific fit.
d <- genomes[genomes$group==groups[[gr_nb]], c("size", "GC")]
d$fit <- fitted(gr_models[[gr_nb]])
d$ID <- rownames(d)
ggplot(data=d) + geom_text(aes(x=size, y=GC, label=ID)) + geom_line(aes(x=size, y=fit))
# Base-R diagnostic plots for this group's model.
plot(gr_models[[gr_nb]])